{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "private_outputs": true,
      "provenance": [],
      "authorship_tag": "ABX9TyPOm2WdJ9b2V34wNq1wr75Q",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/greengerong/awesome-llm/blob/main/colab/datasete%E6%95%B0%E6%8D%AE%E9%9B%86%E5%A4%84%E7%90%86.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# 数据集下载"
      ],
      "metadata": {
        "id": "Sl9q4xFJ4Rqw"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Uw6GaceS38jO"
      },
      "outputs": [],
      "source": [
        "!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz\n",
        "!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.json.gz\n",
        "!gzip -dkv SQuAD_it-*.json.gz"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# 数据集处理"
      ],
      "metadata": {
        "id": "7NRODu0r5IKB"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install datasets transformers  huggingface-hub accelerate"
      ],
      "metadata": {
        "id": "Nw3RMNCU5dr7"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from datasets import load_dataset\n",
        "\n",
        "squad_it_dataset = load_dataset(\"json\", data_files=\"SQuAD_it-train.json\", field=\"data\")\n",
        "print(squad_it_dataset)\n",
        "print(squad_it_dataset[\"train\"][0:5])"
      ],
      "metadata": {
        "id": "9FwmByTI5LR8"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from datasets import load_dataset\n",
        "\n",
        "data_files = {\"train\": \"SQuAD_it-train.json\", \"test\": \"SQuAD_it-test.json\"}\n",
        "squad_it_dataset = load_dataset(\"json\", data_files=data_files, field=\"data\")\n",
        "print(squad_it_dataset)"
      ],
      "metadata": {
        "id": "28oS8G9v65wc"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### 保存为jsonl"
      ],
      "metadata": {
        "id": "b-LcBSeVMs17"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from datasets import load_dataset\n",
        "import json\n",
        "\n",
        "squad_it_dataset['train'].to_json(\"squad.jsonl\")"
      ],
      "metadata": {
        "id": "HQcritx-wHsL"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### 保存为json"
      ],
      "metadata": {
        "id": "S4lt0GTpMxR9"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "squad_it_dataset.set_format(\"pandas\")\n",
        "pd =  squad_it_dataset['train'][:]\n",
        "\n",
        "pd.to_json(f\"squad-pd.json\", orient='records')\n",
        "\n",
        "squad_it_dataset.reset_format()"
      ],
      "metadata": {
        "id": "ctAMcu8xM1jV"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### 加载zip"
      ],
      "metadata": {
        "id": "PDFbL2Rp_cXs"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from datasets import load_dataset\n",
        "\n",
        "data_files = {\"train\": \"SQuAD_it-train.json.gz\", \"test\": \"SQuAD_it-test.json.gz\"}\n",
        "squad_it_dataset = load_dataset(\"json\", data_files=data_files, field=\"data\")\n",
        "\n",
        "print(squad_it_dataset)"
      ],
      "metadata": {
        "id": "-TjUb2Qn_hvE"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### 加载远程数据集"
      ],
      "metadata": {
        "id": "fUbOaUPI_sfI"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from datasets import load_dataset\n",
        "\n",
        "url = \"https://github.com/crux82/squad-it/raw/master/\"\n",
        "data_files = {\n",
        "    \"train\": url + \"SQuAD_it-train.json.gz\",\n",
        "    \"test\": url + \"SQuAD_it-test.json.gz\",\n",
        "}\n",
        "squad_it_dataset = load_dataset(\"json\", data_files=data_files, field=\"data\")\n",
        "\n",
        "print(squad_it_dataset)"
      ],
      "metadata": {
        "id": "NSW7odHC_vY6"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### TSV数据集加载"
      ],
      "metadata": {
        "id": "jaMX0F-mAiv4"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!wget \"https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip\" -y\n",
        "!unzip drugsCom_raw.zip\n",
        "\n",
        "from datasets import load_dataset\n",
        "\n",
        "data_files = {\"train\": \"drugsComTrain_raw.tsv\", \"test\": \"drugsComTest_raw.tsv\"}\n",
        "# \\t is the tab character in Python\n",
        "drug_dataset = load_dataset(\"csv\", data_files=data_files, delimiter=\"\\t\")\n",
        "\n",
        "print(drug_dataset)\n",
        "\n",
        "drug_sample = drug_dataset[\"train\"].shuffle(seed=42).select(range(1000))\n",
        "print(drug_sample[:3])"
      ],
      "metadata": {
        "id": "soLDon2hAnDm"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "drug_dataset = drug_dataset.rename_column(\n",
        "    original_column_name=\"Unnamed: 0\", new_column_name=\"patient_id\"\n",
        ")\n",
        "print(drug_dataset)"
      ],
      "metadata": {
        "id": "c8Ur4l6xCDJn"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def lowercase_condition(example):\n",
        "    return {\"condition\": example[\"condition\"].lower()}\n",
        "\n",
        "def filter_nones(x):\n",
        "    return x[\"condition\"] is not None\n",
        "\n",
        "drug_dataset  = drug_dataset.filter(filter_nones).map(lowercase_condition)\n",
        "\n",
        "print(drug_dataset)\n",
        "print(drug_dataset[\"train\"][:5])\n",
        "print(drug_dataset[\"train\"]['condition'][:5])\n",
        "\n"
      ],
      "metadata": {
        "id": "YX8CKOibCaTS"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def compute_review_length(example):\n",
        "    return {\"review_length\": len(example[\"review\"].split())}\n",
        "\n",
        "drug_dataset = drug_dataset.map(compute_review_length)\n",
        "# Inspect the first training example\n",
        "print(drug_dataset[\"train\"][0])\n",
        "\n",
        "print(drug_dataset[\"train\"].sort(\"review_length\")[:3])"
      ],
      "metadata": {
        "id": "OAy89FlOD2Ue"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "drug_dataset = drug_dataset.filter(lambda x: x[\"review_length\"] > 30)\n",
        "print(drug_dataset.num_rows)"
      ],
      "metadata": {
        "id": "x0k-wk_UE72r"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import html\n",
        "\n",
        "text = \"I&#039;m a transformer called BERT\"\n",
        "print(html.unescape(text))\n",
        "\n",
        "%time  drug_dataset = drug_dataset.map(lambda x: {\"review\": html.unescape(x[\"review\"])}, batched=True)\n",
        "print(drug_dataset)\n",
        "print(drug_dataset['train']['review'][0:5])"
      ],
      "metadata": {
        "id": "wyMIbsekFHou"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from transformers import AutoTokenizer\n",
        "\n",
        "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n",
        "\n",
        "\n",
        "def tokenize_function(examples):\n",
        "    return tokenizer(examples[\"review\"], truncation=True)\n",
        "\n",
        "%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)\n",
        "\n"
      ],
      "metadata": {
        "id": "yyOcjKiDHzcX"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "slow_tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\", use_fast=False)\n",
        "\n",
        "\n",
        "def slow_tokenize_function(examples):\n",
        "    return slow_tokenizer(examples[\"review\"], truncation=True)\n",
        "\n",
        "\n",
        "%time  tokenized_dataset = drug_dataset.map(slow_tokenize_function, batched=True, num_proc=8)"
      ],
      "metadata": {
        "id": "gkciNI6SIEbD"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def tokenize_and_split(examples):\n",
        "    return tokenizer(\n",
        "        examples[\"review\"],\n",
        "        truncation=True,\n",
        "        max_length=128,\n",
        "        return_overflowing_tokens=True,\n",
        "    )\n",
        "\n",
        "%time result = tokenize_and_split(drug_dataset[\"train\"][0])\n",
        "print([len(inp) for inp in result[\"input_ids\"]])"
      ],
      "metadata": {
        "id": "QPgL0hRJIxoL"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "%time tokenized_dataset = drug_dataset.map( tokenize_and_split, batched=True, remove_columns=drug_dataset[\"train\"].column_names)\n",
        "\n",
        "result = len(tokenized_dataset[\"train\"]), len(drug_dataset[\"train\"])\n",
        "print(result)"
      ],
      "metadata": {
        "id": "Aom60N9MJnjl"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def tokenize_and_split(examples):\n",
        "    result = tokenizer(\n",
        "        examples[\"review\"],\n",
        "        truncation=True,\n",
        "        max_length=128,\n",
        "        return_overflowing_tokens=True,\n",
        "    )\n",
        "    # Extract mapping between new and old indices\n",
        "    sample_map = result.pop(\"overflow_to_sample_mapping\")\n",
        "    for key, values in examples.items():\n",
        "        result[key] = [values[i] for i in sample_map]\n",
        "    return result\n",
        "\n",
        "\n",
        "%time tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)\n",
        "print(tokenized_dataset)"
      ],
      "metadata": {
        "id": "4jbBQaw7KhuB"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Datasets 和DataFrame的互相转换"
      ],
      "metadata": {
        "id": "8fGNaVHJLE-n"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "drug_dataset.set_format(\"pandas\")\n",
        "print(drug_dataset[\"train\"][:3])\n",
        "\n",
        "train_df = drug_dataset[\"train\"][:]\n",
        "frequencies = (\n",
        "    train_df[\"condition\"]\n",
        "    .value_counts()\n",
        "    .to_frame()\n",
        "    .reset_index()\n",
        "    .rename(columns={\"index\": \"condition\", \"condition\": \"frequency\"})\n",
        ")\n",
        "frequencies.head()"
      ],
      "metadata": {
        "id": "bEcFSg4ULVe-"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from datasets import Dataset\n",
        "\n",
        "freq_dataset = Dataset.from_pandas(frequencies)\n",
        "freq_dataset"
      ],
      "metadata": {
        "id": "NiGW3bwoMBmQ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "drug_dataset.reset_format()"
      ],
      "metadata": {
        "id": "auWpOIIlMPNb"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "drug_dataset_clean = drug_dataset[\"train\"].train_test_split(train_size=0.8, seed=42)\n",
        "# Rename the default \"test\" split to \"validation\"\n",
        "drug_dataset_clean[\"validation\"] = drug_dataset_clean.pop(\"test\")\n",
        "# Add the \"test\" set to our `DatasetDict`\n",
        "drug_dataset_clean[\"test\"] = drug_dataset[\"test\"]\n",
        "drug_dataset_clean\n"
      ],
      "metadata": {
        "id": "ypJJxULkR72v"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "drug_dataset_clean.save_to_disk(\"drug-reviews\")"
      ],
      "metadata": {
        "id": "gAq4c8qDSBY_"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from datasets import load_from_disk\n",
        "\n",
        "drug_dataset_reloaded = load_from_disk(\"drug-reviews\")\n",
        "drug_dataset_reloaded"
      ],
      "metadata": {
        "id": "7o-vZTRsSNpN"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "for split, dataset in drug_dataset_clean.items():\n",
        "    dataset.to_json(f\"drug-reviews-{split}.jsonl\")\n",
        "\n",
        "!head -n 3 drug-reviews-train.jsonl"
      ],
      "metadata": {
        "id": "M6Ra_VOoSbPF"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "data_files = {\n",
        "    \"train\": \"drug-reviews-train.jsonl\",\n",
        "    \"validation\": \"drug-reviews-validation.jsonl\",\n",
        "    \"test\": \"drug-reviews-test.jsonl\",\n",
        "}\n",
        "drug_dataset_reloaded = load_dataset(\"json\", data_files=data_files)\n",
        "drug_dataset_reloaded"
      ],
      "metadata": {
        "id": "KBdnoPyjSpyw"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# 大文件加载"
      ],
      "metadata": {
        "id": "cZEumBcsS7CT"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install zstandard\n",
        "!pip install psutil"
      ],
      "metadata": {
        "id": "s7u-pF4uS8tQ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from datasets import load_dataset\n",
        "\n",
        "# This takes a few minutes to run, so go grab a tea or coffee while you wait :)\n",
        "data_files = \"https://mystic.the-eye.eu/public/AI/pile_preliminary_components/PUBMED_title_abstracts_2019_baseline.jsonl.zst\"\n",
        "pubmed_dataset_streamed = load_dataset(\"json\", data_files=data_files, split=\"train\", streaming=True)\n",
        "\n",
        "print(pubmed_dataset_streamed)\n",
        "print(next(iter(pubmed_dataset_streamed)))"
      ],
      "metadata": {
        "id": "M-ppreb9TBUf"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "\n",
        "import psutil\n",
        "\n",
        "# Process.memory_info is expressed in bytes, so convert to megabytes\n",
        "print(f\"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB\")\n",
        "\n",
        "print(f\"Number of files in dataset : {pubmed_dataset.dataset_size}\")\n",
        "size_gb = pubmed_dataset.dataset_size / (1024**3)\n",
        "print(f\"Dataset size (cache file) : {size_gb:.2f} GB\")"
      ],
      "metadata": {
        "id": "mM2AGnL_TYwa"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import timeit\n",
        "\n",
        "code_snippet = \"\"\"batch_size = 1000\n",
        "\n",
        "for idx in range(0, len(pubmed_dataset), batch_size):\n",
        "    _ = pubmed_dataset[idx:idx + batch_size]\n",
        "\"\"\"\n",
        "\n",
        "time = timeit.timeit(stmt=code_snippet, number=1, globals=globals())\n",
        "print(\n",
        "    f\"Iterated over {len(pubmed_dataset)} examples (about {size_gb:.1f} GB) in \"\n",
        "    f\"{time:.1f}s, i.e. {size_gb/time:.3f} GB/s\"\n",
        ")"
      ],
      "metadata": {
        "id": "-Y7EPZZ5Tzjp"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "law_dataset_streamed = load_dataset(\n",
        "    \"json\",\n",
        "    data_files=\"https://mystic.the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst\",\n",
        "    split=\"train\",\n",
        "    streaming=True,\n",
        ")\n",
        "next(iter(law_dataset_streamed))"
      ],
      "metadata": {
        "id": "aHCNnt6RUXvb"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from datasets import load_dataset\n",
        "\n",
        "base_url = \"https://mystic.the-eye.eu/public/AI/pile/\"\n",
        "data_files = {\n",
        "    \"train\": [base_url + \"train/\" + f\"{idx:02d}.jsonl.zst\" for idx in range(30)],\n",
        "    \"validation\": base_url + \"val.jsonl.zst\",\n",
        "    \"test\": base_url + \"test.jsonl.zst\",\n",
        "}\n",
        "pile_dataset = load_dataset(\"json\", data_files=data_files, streaming=True)\n",
        "next(iter(pile_dataset[\"train\"]))"
      ],
      "metadata": {
        "id": "5nfORn13VAtK"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}